notebook.community

Edit and run



In [117]:

    
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from __future__ import print_function
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing

%matplotlib inline
%config InlineBackend.figure_format = 'png'
pd.set_option("max_columns",50)



In [2]:



In [3]:

    
%%time
df = pd.read_csv("../data/train_2013.csv", index_col=0)









    



Wall time: 39 s



In [4]:

    
df["date_time"] = pd.to_datetime(df["date_time"], errors="coerce")



In [6]:

    
# %%time
# skip_col = ["date_time","orig_destination_distance"]
# for col in df_1.columns:
#     if col == skip_col:
#         pass
#     print(col, np.unique(df_1[col].astype(str)))



In [35]:

    
# check in / check out / distance  => nan값 존재



In [9]:

    
%%time
df = df.reset_index(drop=True)
# 10000명의 데이터만 사용
df = df.ix[:9999]









    



Wall time: 1.87 s



In [11]:

    
df.to_csv("train_2013_10000.csv")



In [36]:

    
df.columns









    Out[36]:





Index([u'date_time', u'site_name', u'posa_continent', u'user_location_country',
       u'user_location_region', u'user_location_city',
       u'orig_destination_distance', u'user_id', u'is_mobile', u'is_package',
       u'channel', u'srch_ci', u'srch_co', u'srch_adults_cnt',
       u'srch_children_cnt', u'srch_rm_cnt', u'srch_destination_id',
       u'srch_destination_type_id', u'is_booking', u'cnt', u'hotel_continent',
       u'hotel_country', u'hotel_market', u'hotel_cluster'],
      dtype='object')



In [37]:

    
cols = df.columns.tolist()[-6:] + df.columns.tolist()[:-6]
df = df[cols]



In [46]:

    
cols = df.columns.tolist()[:1] + df.columns.tolist()[6:] + df.columns.tolist()[1:6] 
df = df[cols]



In [48]:

    
# 제거할 feature 생각해보기



In [53]:

    
df.head()









    Out[53]:






  
    
      
      is_booking
      date_time
      site_name
      posa_continent
      user_location_country
      user_location_region
      user_location_city
      orig_destination_distance
      user_id
      is_mobile
      is_package
      channel
      srch_ci
      srch_co
      srch_adults_cnt
      srch_children_cnt
      srch_rm_cnt
      srch_destination_id
      srch_destination_type_id
      cnt
      hotel_continent
      hotel_country
      hotel_market
      hotel_cluster
    
  
  
    
      0
      0
      2013-06-15 15:10:49
      30
      4
      195
      548
      56440
      NaN
      1048
      0
      1
      9
      2013-09-07
      2013-09-15
      2
      0
      1
      1385
      1
      1
      0
      185
      185
      58
    
    
      1
      1
      2013-06-15 15:38:05
      30
      4
      195
      548
      56440
      NaN
      1048
      0
      1
      9
      2013-09-06
      2013-09-14
      2
      0
      1
      1385
      1
      1
      0
      185
      185
      58
    
    
      2
      0
      2013-02-15 13:18:43
      2
      3
      66
      462
      41898
      2716.6746
      1482
      0
      0
      1
      2013-02-24
      2013-03-01
      2
      0
      1
      8857
      1
      1
      2
      50
      214
      28
    
    
      3
      0
      2013-02-16 11:57:50
      2
      3
      66
      462
      41898
      2716.5257
      1482
      0
      0
      0
      2013-02-24
      2013-03-01
      2
      0
      1
      8857
      1
      1
      2
      50
      214
      73
    
    
      4
      0
      2013-02-16 12:03:45
      2
      3
      66
      462
      41898
      2722.4856
      1482
      0
      0
      0
      2013-02-24
      2013-03-01
      2
      0
      1
      8857
      1
      1
      2
      50
      214
      26



In [49]:

    
df.columns
#









    Out[49]:





Index([u'is_booking', u'date_time', u'site_name', u'posa_continent',
       u'user_location_country', u'user_location_region',
       u'user_location_city', u'orig_destination_distance', u'user_id',
       u'is_mobile', u'is_package', u'channel', u'srch_ci', u'srch_co',
       u'srch_adults_cnt', u'srch_children_cnt', u'srch_rm_cnt',
       u'srch_destination_id', u'srch_destination_type_id', u'cnt',
       u'hotel_continent', u'hotel_country', u'hotel_market',
       u'hotel_cluster'],
      dtype='object')



In [68]:

    
delete_list = ["user_location_city", "user_location_region","is_mobile","is_package","hotel_country","hotel_market"]



In [70]:

    
df = df.drop(delete_list, axis=1)



In [75]:

    
print(df.columns, len(df.columns))









    



Index([u'is_booking', u'date_time', u'site_name', u'posa_continent',
       u'user_location_country', u'orig_destination_distance', u'user_id',
       u'channel', u'srch_ci', u'srch_co', u'srch_adults_cnt',
       u'srch_children_cnt', u'srch_rm_cnt', u'srch_destination_id',
       u'srch_destination_type_id', u'cnt', u'hotel_continent',
       u'hotel_cluster'],
      dtype='object') 18



In [84]:

    
df = df.drop(["posa_continent","orig_destination_distance", "srch_destination_type_id"], axis=1)



In [83]:

    
for col in df.columns:
    if col == "date_time":
        continue
    print(df[col].value_counts())
# df["posa_continent"].value_counts()









    



0    8973
1    1027
Name: is_booking, dtype: int64
2     6498
37    1609
11     495
34     429
24     293
13     190
17     132
8       60
9       53
32      46
25      42
22      42
40      34
33      25
30      17
35      10
28       8
26       7
18       4
10       3
19       3
Name: site_name, dtype: int64
3    7459
1    2038
2     416
4      77
0      10
Name: posa_continent, dtype: int64
66     5960
69     1429
205     860
3       237
46      227
133     184
68      116
225      93
80       72
77       63
70       62
32       53
28       46
62       45
215      44
23       39
231      38
0        37
235      35
154      34
239      33
198      31
85       23
93       22
195      19
29       19
158      18
194      17
182      17
142      14
48       13
148      12
168       9
115       7
27        6
12        6
162       6
51        5
103       5
208       5
1         5
39        5
82        5
119       4
54        3
64        3
157       3
202       3
5         2
57        2
190       2
166       2
Name: user_location_country, dtype: int64
5615.1972    25
202.0282     18
5797.7663    13
99.3290      13
96.6554      12
63.1618      10
87.9549      10
1590.7968     9
1868.9003     9
444.0290      9
2381.9139     9
295.6522      9
394.6765      8
318.2518      8
1235.6843     8
188.5710      8
51.6725       7
4527.2234     7
192.3717      7
6595.6508     6
79.0066       6
1917.9193     6
4527.4575     6
4491.8270     6
16.9159       6
1657.3570     6
1036.2993     6
142.9752      6
5095.1241     6
1622.4278     6
             ..
2528.9976     1
2226.1895     1
62.0251       1
139.6310      1
267.9601      1
2298.5065     1
1265.1169     1
2407.0524     1
734.3049      1
229.9292      1
1578.9702     1
218.9363      1
34.6006       1
407.4950      1
3351.0581     1
625.4276      1
1236.5718     1
1778.1646     1
4314.3084     1
1975.4526     1
192.7684      1
191.8491      1
258.0471      1
27.6370       1
157.6479      1
5809.3784     1
8407.6806     1
16.7685       1
770.8005      1
369.4375      1
Name: orig_destination_distance, dtype: int64
70535     345
33803     228
94390     212
71855     166
69003     156
121433    154
50191     152
123225    142
122669    115
81357     110
85275      98
108285     96
76943      92
70340      91
78474      86
34019      85
90864      84
9616       77
72708      76
125389     72
82160      71
115418     69
134677     69
38878      67
106813     65
117339     65
112433     65
118142     64
41165      64
88429      63
         ... 
111660      1
99735       1
54981       1
88681       1
18287       1
131587      1
82601       1
7523        1
134530      1
78546       1
101223      1
64523       1
113534      1
87044       1
17692       1
93505       1
85175       1
54670       1
70929       1
76809       1
97827       1
70663       1
78773       1
77811       1
97028       1
6300        1
98150       1
75637       1
32458       1
88198       1
Name: user_id, dtype: int64
9     5800
0     1273
1     1030
2      688
3      470
5      396
4      260
7       55
6       15
8       12
10       1
Name: channel, dtype: int64
2013-05-04    131
2013-10-18     94
2013-10-05     80
2013-12-29     80
2013-05-07     76
2013-07-03     71
2013-09-13     69
2013-03-21     66
2013-08-15     64
2013-08-11     64
2013-11-30     63
2013-08-09     62
2013-08-29     62
2013-08-17     59
2013-08-30     58
2013-03-29     58
2013-10-25     58
2013-10-04     57
2013-09-10     57
2013-08-24     56
2013-07-05     54
2013-10-17     54
2013-08-13     53
2013-08-16     53
2013-10-26     52
2013-05-08     51
2013-07-24     50
2013-03-07     49
2013-03-14     49
2013-12-27     49
             ... 
2014-07-22      1
2014-03-01      1
2014-05-15      1
2014-06-23      1
2014-05-28      1
2014-05-22      1
2014-04-25      1
2014-07-14      1
2014-08-06      1
2014-03-13      1
2014-03-19      1
2014-05-30      1
2014-05-08      1
2014-08-11      1
2014-04-19      1
2014-07-02      1
2014-03-12      1
2014-04-09      1
2014-06-08      1
2014-05-14      1
2013-01-08      1
2014-05-04      1
2014-12-08      1
2014-09-03      1
2014-01-31      1
2014-05-18      1
2014-05-02      1
2014-03-14      1
2014-03-09      1
2014-06-04      1
Name: srch_ci, dtype: int64
2013-05-08    178
2013-10-20    127
2013-03-24    101
2013-12-31     93
2013-12-01     79
2013-09-15     75
2013-07-12     71
2013-11-09     66
2013-09-02     65
2013-07-07     63
2013-08-23     62
2013-08-21     62
2013-11-27     62
2013-11-28     62
2013-05-03     59
2013-03-16     59
2013-08-04     59
2013-03-07     59
2013-08-26     59
2013-09-22     57
2013-10-11     56
2013-12-29     55
2013-05-10     55
2013-09-14     55
2013-06-07     55
2013-09-26     55
2013-05-05     54
2013-05-09     54
2013-10-12     52
2013-08-09     51
             ... 
2013-07-24      1
2014-02-25      1
2014-10-03      1
2014-02-11      1
2014-08-09      1
2013-08-05      1
2014-04-25      1
2014-04-26      1
2014-12-14      1
2014-07-19      1
2014-06-03      1
2014-05-23      1
2013-01-26      1
2014-06-23      1
2013-02-11      1
2014-03-10      1
2014-06-01      1
2014-05-09      1
2014-06-06      1
2014-03-26      1
2014-06-24      1
2014-04-19      1
2014-07-09      1
2014-05-18      1
2014-09-05      1
2014-03-12      1
2014-06-12      1
2014-12-22      1
2014-06-19      1
2014-03-25      1
Name: srch_co, dtype: int64
2    6208
1    2456
4     707
3     418
5      98
6      58
8      32
0      17
9       4
7       2
Name: srch_adults_cnt, dtype: int64
0    7193
1    1845
2     711
3     233
4       7
5       6
9       5
Name: srch_children_cnt, dtype: int64
1    8955
2     794
3     124
5      41
4      37
8      32
6      12
7       5
Name: srch_rm_cnt, dtype: int64
8267     296
8250     279
8746     194
12206    138
8268     131
8791     108
11439     98
8279      91
8278      89
7635      87
8745      84
8230      83
8220      82
8260      75
44045     70
468       65
8788      62
12257     61
8253      58
8213      57
12264     57
8855      56
8862      52
9147      48
12190     48
8266      47
20225     47
11353     47
12191     46
12227     46
        ... 
3093       1
11977      1
3789       1
12215      1
10320      1
40522      1
28593      1
43240      1
11972      1
4287       1
41137      1
24282      1
22235      1
24298      1
24322      1
45147      1
12363      1
61193      1
20328      1
28585      1
28556      1
12193      1
6043       1
12153      1
6051       1
28481      1
7999       1
20400      1
14138      1
22525      1
Name: srch_destination_id, dtype: int64
1    5608
6    2667
3     894
5     409
4     403
8      19
Name: srch_destination_type_id, dtype: int64
1     7005
2     1653
3      669
4      330
5      149
6       69
7       48
8       23
9       16
11      14
10       6
14       5
13       4
12       3
16       3
15       1
17       1
23       1
Name: cnt, dtype: int64
2    5575
6    2219
4    1088
3     848
5     188
0      82
Name: hotel_continent, dtype: int64
91    306
48    258
41    249
64    216
25    195
42    172
10    171
16    161
95    157
97    153
65    153
50    150
46    144
21    143
68    140
30    137
18    137
47    137
37    132
70    130
83    129
59    128
6     128
98    128
5     127
58    126
9     122
2     119
1     118
72    116
     ... 
15     71
39     69
38     69
31     68
14     67
20     66
19     66
92     64
12     63
67     60
45     60
43     60
51     59
79     57
66     57
60     56
93     53
71     50
49     48
23     48
75     46
35     41
87     40
63     38
88     35
24     33
53     33
80     25
27     18
74      6
Name: hotel_cluster, dtype: int64



In [88]:

    
# 별다른 Feature Engineering을 하지 않고 제거해서 model을 돌려보자



In [100]:

    
df["srch_ci"] = pd.to_datetime(df["srch_ci"], errors="coerce")
df["srch_co"] = pd.to_datetime(df["srch_co"], errors="coerce")



In [118]:

    
le = preprocessing.LabelEncoder()



In [121]:

    
df["srch_ci"] = le.fit_transform(df["srch_ci"])
df["srch_co"] = le.fit_transform(df["srch_co"])



In [134]:

    
df["date_time"] = df["date_time"].dt.date
df["date_time"] = le.fit_transform(df["date_time"])



In [135]:

    
trn_x = df.ix[:,1:]
trn_y = df.ix[:,:1]



In [136]:

    
model = RandomForestClassifier(max_depth=3, n_jobs=-1, random_state=402)

y 를 잘못 설정해부렸던 case



In [137]:

    
model.fit(trn_x,trn_y)









    



C:\Users\Byeon\Anaconda3\envs\py27\lib\site-packages\ipykernel\__main__.py:1: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
  if __name__ == '__main__':






    Out[137]:





RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=3, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=402, verbose=0, warm_start=False)



In [153]:

    
importances = model.feature_importances_

std = np.std([tree.feature_importances_ for tree in model.estimators_], axis=0)
indices = np.argsort(importances)[::-1]

print("Feature ranking:")
for f in range(trn_x.shape[1]):
#     print(indices[f])
    print("%d. feature %d %s (%f)" % (f + 1, indices[f], trn_x.columns[indices[f]], importances[indices[f]]))

plt.title("Feature importances")
plt.bar(range(trn_x.shape[1]), importances[indices], color="r", yerr=std[indices], align="center")
plt.xticks(range(trn_x.shape[1]), indices)
plt.xlim([-1, trn_x.shape[1]])
plt.show()









    



Feature ranking:
1. feature 11 cnt (0.533188)
2. feature 6 srch_co (0.102094)
3. feature 7 srch_adults_cnt (0.091235)
4. feature 12 hotel_continent (0.050153)
5. feature 3 user_id (0.042140)
6. feature 4 channel (0.039411)
7. feature 8 srch_children_cnt (0.036964)
8. feature 10 srch_destination_id (0.032244)
9. feature 5 srch_ci (0.030716)
10. feature 9 srch_rm_cnt (0.021721)
11. feature 1 site_name (0.009355)
12. feature 13 hotel_cluster (0.007323)
13. feature 0 date_time (0.003447)
14. feature 2 user_location_country (0.000008)



In [ ]:

    
# 1. feature 11 cnt (0.533188)
# 2. feature 6 srch_co (0.102094)
# 3. feature 7 srch_adults_cnt (0.091235)
# 4. feature 12 hotel_continent (0.050153)
# 5. feature 3 user_id (0.042140)
# 6. feature 4 channel (0.039411)
# 7. feature 8 srch_children_cnt (0.036964)
# 8. feature 10 srch_destination_id (0.032244)
# 9. feature 5 srch_ci (0.030716)
# 10. feature 9 srch_rm_cnt (0.021721)
# 11. feature 1 site_name (0.009355)
# 12. feature 13 hotel_cluster (0.007323)
# 13. feature 0 date_time (0.003447)
# 14. feature 2 user_location_country (0.000008) => 제거



In [154]:

    
sub_ex = pd.read_csv("../sample_submission.csv")



In [164]:

    
sub_ex.head()









    Out[164]:






  
    
      
      id
      hotel_cluster
    
  
  
    
      0
      0
      99 1
    
    
      1
      1
      99 1
    
    
      2
      2
      99 1
    
    
      3
      3
      99 1
    
    
      4
      4
      99 1



In [158]:

    
trn_x.head()









    Out[158]:






  
    
      
      date_time
      site_name
      user_location_country
      user_id
      channel
      srch_ci
      srch_co
      srch_adults_cnt
      srch_children_cnt
      srch_rm_cnt
      srch_destination_id
      cnt
      hotel_continent
      hotel_cluster
    
  
  
    
      0
      159
      30
      195
      1048
      9
      239
      245
      2
      0
      1
      1385
      1
      0
      58
    
    
      1
      159
      30
      195
      1048
      9
      238
      244
      2
      0
      1
      1385
      1
      0
      58
    
    
      2
      39
      2
      66
      1482
      1
      44
      47
      2
      0
      1
      8857
      1
      2
      28
    
    
      3
      40
      2
      66
      1482
      0
      44
      47
      2
      0
      1
      8857
      1
      2
      73
    
    
      4
      40
      2
      66
      1482
      0
      44
      47
      2
      0
      1
      8857
      1
      2
      26



In [ ]:

    
# is_booking이 y라고 생각했는데 다시 생각해보니 hotel_cluster가 중요함



In [163]:

    
trn_x1 = df.ix[:,:-1]
trn_y1 = df.ix[:,-1:]

model.fit(trn_x1,trn_y1)

importances = model.feature_importances_

std = np.std([tree.feature_importances_ for tree in model.estimators_], axis=0)
indices = np.argsort(importances)[::-1]

print("Feature ranking:")
for f in range(trn_x1.shape[1]):
    print("%d. feature %d %s (%f)" % (f + 1, indices[f], trn_x1.columns[indices[f]], importances[indices[f]]))

plt.title("Feature importances")
plt.bar(range(trn_x1.shape[1]), importances[indices], color="r", yerr=std[indices], align="center")
plt.xticks(range(trn_x1.shape[1]), indices)
plt.xlim([-1, trn_x1.shape[1]])
plt.show()
# 위로 10000개 잡고, 샘플링, 다시 나오나 보고 변한다면, 데이터가 흔들리는지 확인
# feature 샘플링.









    



C:\Users\Byeon\Anaconda3\envs\py27\lib\site-packages\ipykernel\__main__.py:4: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().






    



Feature ranking:
1. feature 13 hotel_continent (0.364650)
2. feature 11 srch_destination_id (0.172174)
3. feature 2 site_name (0.159535)
4. feature 3 user_location_country (0.102945)
5. feature 4 user_id (0.077250)
6. feature 7 srch_co (0.027816)
7. feature 6 srch_ci (0.019617)
8. feature 10 srch_rm_cnt (0.017608)
9. feature 9 srch_children_cnt (0.017537)
10. feature 1 date_time (0.014128)
11. feature 8 srch_adults_cnt (0.011914)
12. feature 5 channel (0.010902)
13. feature 12 cnt (0.003925)
14. feature 0 is_booking (0.000000)



In [ ]:

    
# Feature ranking:
# 1. feature 13 hotel_continent (0.364650) 
# 2. feature 11 srch_destination_id (0.172174)
# 3. feature 2 site_name (0.159535)
# 4. feature 3 user_location_country (0.102945)
# 5. feature 4 user_id (0.077250)
# 6. feature 7 srch_co (0.027816)
# 7. feature 6 srch_ci (0.019617)
# 8. feature 10 srch_rm_cnt (0.017608)
# 9. feature 9 srch_children_cnt (0.017537)
# 10. feature 1 date_time (0.014128)
# 11. feature 8 srch_adults_cnt (0.011914)
# 12. feature 5 channel (0.010902)
# 13. feature 12 cnt (0.003925)
# 14. feature 0 is_booking (0.000000)


# co-ci 기간 변수 
# is_booking한 사람의

Feature engineering



In [179]:

    
df.head()









    Out[179]:






  
    
      
      is_booking
      date_time
      site_name
      user_location_country
      user_id
      channel
      srch_ci
      srch_co
      srch_adults_cnt
      srch_children_cnt
      srch_rm_cnt
      srch_destination_id
      cnt
      hotel_continent
      hotel_cluster
    
  
  
    
      0
      0
      159
      30
      195
      1048
      9
      239
      245
      2
      0
      1
      1385
      1
      0
      58
    
    
      1
      1
      159
      30
      195
      1048
      9
      238
      244
      2
      0
      1
      1385
      1
      0
      58
    
    
      2
      0
      39
      2
      66
      1482
      1
      44
      47
      2
      0
      1
      8857
      1
      2
      28
    
    
      3
      0
      40
      2
      66
      1482
      0
      44
      47
      2
      0
      1
      8857
      1
      2
      73
    
    
      4
      0
      40
      2
      66
      1482
      0
      44
      47
      2
      0
      1
      8857
      1
      2
      26

	is_booking	date_time	site_name	posa_continent	user_location_country	user_location_region	user_location_city	orig_destination_distance	user_id	is_package	channel	srch_ci	srch_co	srch_adults_cnt	srch_rm_cnt	srch_destination_id	srch_destination_type_id	cnt	hotel_continent	hotel_country	hotel_market	hotel_cluster
0	0	2013-06-15 15:10:49	30	4	195	548	56440	NaN	1048	1	9	2013-09-07	2013-09-15	2	1	1385	1	1	0	185	185	58
1	1	2013-06-15 15:38:05	30	4	195	548	56440	NaN	1048	1	9	2013-09-06	2013-09-14	2	1	1385	1	1	0	185	185	58
2	0	2013-02-15 13:18:43	2	3	66	462	41898	2716.6746	1482	0	1	2013-02-24	2013-03-01	2	1	8857	1	1	2	50	214	28
3	0	2013-02-16 11:57:50	2	3	66	462	41898	2716.5257	1482	0	0	2013-02-24	2013-03-01	2	1	8857	1	1	2	50	214	73
4	0	2013-02-16 12:03:45	2	3	66	462	41898	2722.4856	1482	0	0	2013-02-24	2013-03-01	2	1	8857	1	1	2	50	214	26